%%javascript
// Notebook magic: stop Jupyter from collapsing long cell outputs into a scroll box.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}
# Widen the notebook container to the full browser width.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import sys
import io
import os
import pickle
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from IPython.display import SVG
from sklearn.model_selection import train_test_split
import gensim.downloader as api
#wv = api.load('word2vec-google-news-300')
### Matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# ColorBrewer "Dark2" qualitative palette (hex); dark2_colors[0] becomes the
# default patch color in set_mpl_params() below.
dark2_colors = ['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d','#666666']
def set_mpl_params():
    """Apply this notebook's house Matplotlib style via rcParams."""
    style = {
        'figure.figsize': (10, 6),
        'figure.dpi': 150,
        'lines.linewidth': 2,
        'axes.facecolor': 'white',
        'font.size': 12,
        'patch.edgecolor': 'white',
        'patch.facecolor': dark2_colors[0],   # first Dark2 palette color
        #'font.family': 'StixGeneral'
        'font.family': 'sans-serif',
        'font.sans-serif': ['DejaVu Sans'],
    }
    for option, value in style.items():
        rcParams[option] = value
mpl.rcdefaults()  # reset to Matplotlib defaults before applying the house style
set_mpl_params()
pd.set_option('display.width', 500)        # wider text rendering of DataFrames
pd.set_option('display.max_columns', 100)  # show up to 100 columns
%matplotlib inline
import rdkit as rdk
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import warnings
# NOTE(review): blanket-suppresses *all* warnings for cleaner notebook output;
# consider narrowing to specific categories.
warnings.filterwarnings('ignore')
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
def mol2sentence(mol, radius):
    """Calculates ECFP (Morgan fingerprint) and returns identifiers of substructures as 'sentence' (string).

    Returns a tuple with 1) a list with sentence for each radius and 2) a sentence with identifiers from
    all radii combined.
    NOTE: Words are ALWAYS reordered according to atom order in the input mol object.
    NOTE: Due to the way how Morgan FPs are generated, number of identifiers at each radius is smaller

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    radius : int
        Fingerprint radius

    Returns
    -------
    identifier sentence
        List with sentences for each radius
    alternating sentence
        Sentence (list) with identifiers from all radii combined
    """
    radii = list(range(int(radius) + 1))
    # bit_info maps identifier -> tuple of (atom_idx, radius) occurrences.
    bit_info = {}
    AllChem.GetMorganFingerprint(mol, radius, bitInfo=bit_info)
    # Build {atom_idx: {radius: identifier or None}} in atom order.
    per_atom = {atom.GetIdx(): dict.fromkeys(radii) for atom in mol.GetAtoms()}
    for identifier, occurrences in bit_info.items():
        for atom_idx, radius_at in occurrences:
            per_atom[atom_idx][radius_at] = identifier
    # One sentence per radius: identifiers of every atom at that radius
    # (atoms with no identifier at this radius are skipped).
    identifier_sentences = [
        [str(ident) for ident in (per_atom[a][r] for a in per_atom) if ident]
        for r in radii
    ]
    # Alternating sentence: atom 0 radius 0, atom 0 radius 1, ..., atom 1 radius 0, ...
    alternating_sentence = [
        str(ident)
        for a in per_atom
        for ident in (per_atom[a][r] for r in radii)
        if ident
    ]
    return identifier_sentences, alternating_sentence
def sentences2vec(sentences, model, unseen=None):
    """Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
    sum of vectors for individual words.

    Parameters
    ----------
    sentences : list, array
        List with sentences (each an iterable of string identifiers)
    model : word2vec.Word2Vec
        Gensim word2vec model (vectors are looked up through ``model.wv``)
    unseen : None, str
        Keyword for unseen words. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
        One summed vector per sentence.
    """
    keys = set(model.wv.vocab.keys())
    # Fix: the original tested ``y in set(sentence) & keys`` for every word y,
    # rebuilding set(sentence) per word (accidental O(len(sentence)**2)).
    # Since y is drawn from sentence, ``y in set(sentence)`` is always true,
    # so membership in the vocabulary alone is equivalent.
    vec = []
    if unseen:
        unseen_vec = model.wv.word_vec(unseen)
        for sentence in sentences:
            vec.append(sum(model.wv.word_vec(y) if y in keys else unseen_vec
                           for y in sentence))
    else:
        for sentence in sentences:
            vec.append(sum(model.wv.word_vec(y) for y in sentence
                           if y in keys))
    return np.array(vec)
def featurize(df, out_file, model_path, r, uncommon=None):
    """Featurize mols in a Pandas dataframe.

    SMILES are regenerated with RDKit to get canonical SMILES without chirality information.

    Parameters
    ----------
    df : dataframe
        Input Pandas dataframe with a 'Smiles' column
    out_file : str
        Output csv
    model_path : str
        File path to pre-trained Gensim word2vec model
    r : int
        Radius of morgan fingerprint
    uncommon : str
        String used to replace uncommon words/identifiers while training. Vector obtained for
        'uncommon' will be used to encode new (unseen) identifiers

    Returns
    -------
    tuple (np.array, pd.DataFrame)
        The mol2vec feature matrix and the featurized dataframe (which is also
        written to ``out_file`` without the RDKit mol / sentence columns).
    """
    # Load the model
    word2vec_model = Word2Vec.load(model_path)
    if uncommon:
        try:
            word2vec_model[uncommon]
        except KeyError:
            raise KeyError('Selected word for uncommon: %s not in vocabulary' % uncommon)
    print('Loading molecules.')
    df['ROMol'] = df['Smiles'].map(lambda s: Chem.MolFromSmiles(str(s)))
    print("Keeping only molecules that can be processed by RDKit.")
    # Fix: take an explicit copy so the column assignments below write to a
    # real frame, not a view (the original pattern raises pandas'
    # SettingWithCopyWarning and can silently fail to assign).
    df = df[df['ROMol'].notnull()].copy()
    df['QED'] = df['ROMol'].map(QED.qed)              # drug-likeness score
    df['Smiles'] = df['ROMol'].map(Chem.MolToSmiles)  # Recreate SMILES
    print('Featurizing molecules.')
    # The "alternating" sentence (all radii interleaved) feeds the embedding.
    df['mol-sentence'] = df['ROMol'].map(lambda m: MolSentence(mol2sentence(m, r)[1]))
    vectors = sentences2vec(df['mol-sentence'], word2vec_model, unseen=uncommon)
    print(vectors.shape)
    df_vec = pd.DataFrame(vectors,
                          columns=['mol2vec-%03i' % x for x in range(vectors.shape[1])],
                          index=df.index)
    df = df.join(df_vec)
    # Persist without the unpicklable RDKit mol and sentence columns.
    df.drop(['ROMol', 'mol-sentence'], axis=1).to_csv(out_file)
    return vectors, df
# Positive class: bioavailable compounds.
in_file = './BA-NIHMS.csv'
df_pos = pd.read_csv(in_file, delimiter=',', usecols=[0, 1, 2, 3], names=['ID', 'Name', 'Smiles', 'PercentF'], header=0, encoding='latin-1') # comma-separated (original comment wrongly said <tab>)
df_pos.describe()
df_pos
# Negative class: non-bioavailable compounds.
in_file = './non-BA-NIHMS.csv'
df_neg = pd.read_csv(in_file, delimiter=',', usecols=[0, 1, 2, 3], names=['ID', 'Name', 'Smiles', 'PercentF'], header = 0, encoding='latin-1') # comma-separated (original comment wrongly said <tab>)
df_neg.describe()
# Binary labels: 1 = bioavailable, 0 = not bioavailable.
df_neg['Class'] = 0
print(df_neg.columns)
df_pos['Class'] = 1
print(df_pos.columns)
# Stack both classes into one frame with a fresh 0..n-1 index.
df = pd.concat([df_neg, df_pos])
df.reset_index(drop=True, inplace=True)
df.describe()
# Featurize with the pre-trained 300-dim mol2vec model using radius-2 Morgan
# substructures; 'UNK' encodes identifiers unseen during embedding training.
model_path = './models/model_300dim.pkl'
out_file = 'BA-vectors.csv'
X, df = featurize(df, out_file, model_path, 2, uncommon='UNK')
df['QED'].describe()
import seaborn as sns
sns.set(rc={'figure.figsize':(16,16)})
palette = sns.color_palette("bright",2)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
import umap.umap_ as UMAP
#pca_model = PCA(n_components=30)
scaler = StandardScaler()
pca_model = PCA(n_components=2)
umap_model = UMAP.UMAP()
y = df['Class']
# Continuous target: percent bioavailability rescaled to [0, 1].
predY = df['PercentF'].values / 100.
# calculate the variance explained by the PC analysis
pca = PCA(n_components=4).fit(scaler.fit_transform(X))
var_exp = pca.explained_variance_ratio_.cumsum()*100.
print ('The 1st Principal Component explains {:03.1f} % of the variance\n'.format(var_exp[0]))
print ('The 1st and 2nd Principal Components explain {:03.1f} % of the variance\n'.format(var_exp[1]))
print ('The 1st, 2nd and 3rd Principal Components explain {:03.1f} % of the variance\n'.format(var_exp[2]))
print ('The first four Principal Components explain {:03.1f} % of the variance\n'.format(var_exp[3]))
# Three 2-D embeddings of the mol2vec features for visual comparison.
tsne_model = TSNE(n_components=2, random_state=20, perplexity=200, n_iter=1000, metric='cosine')
tsne = tsne_model.fit_transform(X)
pca = pca_model.fit_transform(scaler.fit_transform(X))
umap = umap_model.fit_transform(scaler.fit_transform(X))
# Collect the embeddings in one frame keyed by compound ID.
df_vec = pd.DataFrame()
df_vec['identifier'] = list([str(x) for x in df['ID'].values.tolist()])
df_vec.index = df_vec['identifier']
df_vec['t-SNE-c1'] = tsne.T[0]
df_vec['t-SNE-c2'] = tsne.T[1]
df_vec['PCA-c1'] = pca.T[0]
df_vec['PCA-c2'] = pca.T[1]
df_vec['UMAP-c1'] = umap.T[0]
df_vec['UMAP-c2'] = umap.T[1]
df_vec['Class'] = ['BioAvailable' if x == 1
                   else 'Not BioAvailable' for x in df['Class'].tolist()]
df_vec.head(3)
# NOTE(review): positional x/y arguments to sns.scatterplot are removed in
# seaborn >= 0.12; pass x=..., y=... when upgrading.
sns.scatterplot('t-SNE-c1','t-SNE-c2', hue='Class', data=df_vec, legend='full', palette=palette)
plt.savefig('NIHMS_tSNE_byBioAvalClass.png')
sns.scatterplot('PCA-c1','PCA-c2', hue='Class', data=df_vec, legend='full', palette=palette)
plt.savefig('NIHMS_PCA_byBioAvalClass.png')
sns.scatterplot('UMAP-c1','UMAP-c2', hue='Class', data=df_vec, legend='full', palette=palette)
plt.savefig('NIHMS_UMAP_byBioAvalClass.png')
from optparse import OptionParser
from time import time
import sklearn
from sklearn import svm, datasets
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection, metrics
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, label_binarize, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model, svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, RandomTreesEmbedding
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn import datasets, feature_selection, cluster, feature_extraction
from sklearn import neighbors, decomposition, metrics
from sklearn import decomposition, feature_selection
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.utils.extmath import density
import json #
import datetime as dt # module for manipulating dates and times
from datetime import datetime
import itertools
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.utils import shuffle
# Shuffle, then split 80/10/10 into train / dev / test, keeping the binary
# labels (y) and the continuous bioavailability target (predY) aligned with X.
RANDOM_STATE = 458
TEST_SIZE = 0.2
X, y, predY = shuffle(X, y, predY, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test, predY_train, predY_test = train_test_split(X,y,predY, test_size=TEST_SIZE, random_state=RANDOM_STATE)
# Split the held-out 20% in half: dev (tuning) and test (final evaluation).
X_dev, X_test, y_dev, y_test, predY_dev, predY_test = train_test_split(X_test, y_test, predY_test, test_size=0.5, random_state=RANDOM_STATE)
print (len(X_train))
print (len(X_dev))
print (len(X_test))
def log_ratio(x, eps):
    """Log-odds (logit) transform with additive smoothing: log((x+eps)/(1-x+eps)).

    Bug fix: the original omitted the parentheses around the numerator, so it
    evaluated log(x + eps/(1-x+eps)) — not a log-odds at all (e.g. it returned
    log(0.5) instead of 0 at x = 0.5).

    Parameters
    ----------
    x : float or np.ndarray
        Proportion(s), expected in [0, 1].
    eps : float
        Smoothing constant keeping the ratio finite at x == 0 or x == 1.
    """
    return np.log((x + eps) / (1 - x + eps))
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    # Shift by the maximum for numerical stability (exp never overflows).
    shifted = np.asarray(x) - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()
# Sanity check: softmax([0, 1]) should print ~[0.269, 0.731].
scores = [0., 1.0]
print(softmax(scores))
## Benchmark the classifiers, one at a time
def log_ratio(x, eps):
    """Log-odds (logit) transform with additive smoothing: log((x+eps)/(1-x+eps)).

    Bug fix: this duplicate definition had the same missing parentheses as the
    first one — it computed log(x + eps/(1-x+eps)) instead of the log-odds.

    Parameters
    ----------
    x : float or np.ndarray
        Proportion(s), expected in [0, 1].
    eps : float
        Smoothing constant keeping the ratio finite at x == 0 or x == 1.
    """
    return np.log((x + eps) / (1 - x + eps))
def benchmark(clf, name, X_train, X_dev, y_train, y_dev, predY_dev):
    """
    clf - the classifier
    name - its name
    benchmark: to create the benchmark metrics for the classification
    (fit on the train split, then report accuracy, macro-F1, ROC AUC,
    confusion-matrix rates, and regression-style scores comparing the
    predicted class-1 probabilities with the continuous %F target)
    returns: (name, accuracy, macro-F1, ROC AUC, train time, test time)
    """
    eps = 1e-6  # smoothing constant for the log-ratio (logit) transform
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    pred = clf.predict(X_dev)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    acc_score = metrics.accuracy_score(y_dev, pred)
    print("accuracy: %0.3f" % acc_score)
    f1_macro_score = metrics.f1_score(y_dev, pred, average='macro')
    print("*** F1 macro avg score: %0.3f" % f1_macro_score)
    # Probability of the positive class, for AUC and the regression scores.
    y_score = clf.predict_proba(X_dev)[:,1]
    print("predict proba function")
    auc_score = metrics.roc_auc_score(y_dev, y_score)
    print("*** AUC for ROC = %0.3f\n" % auc_score)
    print("classification report:")
    print(metrics.classification_report(y_dev, pred,
                                        target_names=categories))
    conf_mat = metrics.confusion_matrix(y_dev, pred)
    print("confusion matrix:")
    print(conf_mat)
    # Confusion-matrix-derived rates, as percentages.
    tn, fp, fn, tp = conf_mat.ravel()
    sensitivity = tp / (tp+fn) *100.
    print("\nsensitivity / recall (TPR):")
    print(np.round(sensitivity,3))
    specificity = tn / (tn+fp) *100.
    print("specificity (TNR):")
    print(np.round(specificity,3))
    CCR = ( sensitivity + specificity) / 2
    print("Correct classification rate (CCR); balanced accuracy:")
    print(np.round(CCR,3))
    # Regression-style agreement between predicted probabilities and the
    # continuous %F target, raw and after the log-ratio transform.
    # (Dead code removed: the original hand-computed R2 from ybar/ssreg/sstot
    # and immediately overwrote it with metrics.r2_score; it also built an
    # unused clf_descr string.)
    R2 = metrics.r2_score(predY_dev,y_score)
    print("R-squared value %F:")
    print(np.round(R2,3))
    mae = metrics.mean_absolute_error(predY_dev,y_score)
    print("MAE value %F:")
    print(np.round(mae,3))
    R2 = metrics.r2_score(log_ratio(predY_dev, eps), log_ratio(y_score, eps))
    print("R-squared value logK(%F):")
    print(np.round(R2,3))
    mae = metrics.mean_absolute_error(log_ratio(predY_dev, eps), log_ratio(y_score, eps))
    print("MAE value logK(%F):")
    print(np.round(mae,3))
    print()
    return name, acc_score, f1_macro_score, auc_score, train_time, test_time
def create_results(X_train, X_dev, y_train, y_dev, predY_dev):
    """
    create_results: run the battery of classifiers through benchmark(),
    fitting each on the train split and scoring on the dev split
    returns: a list of benchmark() result tuples, one per classifier
    """
    results = []
    # (classifier instance, display name) pairs evaluated in order.
    for clf, name in (
            (KNeighborsClassifier(n_neighbors=20), "k Nearest Neighbors | k=20"),
            (KNeighborsClassifier(n_neighbors=30), "k Nearest Neighbors | k=30"),
            (BernoulliNB(alpha=.01),"Bernouilli Naive Bayes"),
            (DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE), "Decision Tree | MaxDepth=3"),
            (DecisionTreeClassifier(max_depth=7, random_state=RANDOM_STATE), "Decision Tree | MaxDepth=7"),
            (BaggingClassifier(n_estimators=10, random_state=RANDOM_STATE), "Bagging | 10 trees"),
            (BaggingClassifier(n_estimators=40, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE), "Bagging | 40 trees 40% max features"),
            (AdaBoostClassifier(n_estimators=10, random_state=RANDOM_STATE), "AdaBoost | 10 trees"),
            (AdaBoostClassifier(n_estimators=40, random_state=RANDOM_STATE), "AdaBoost | 40 trees"),
            (GradientBoostingClassifier(n_estimators=40, learning_rate=1.0, max_depth=1, random_state=RANDOM_STATE) , "Gradient Boosting | 40 trees"),
            (GradientBoostingClassifier(n_estimators=40, learning_rate=0.7, max_depth=10, random_state=RANDOM_STATE), "Gradient Boosting | 40 trees, max-depth=10"),
            (RandomForestClassifier(n_estimators=100, max_features='auto', random_state=RANDOM_STATE), "Random Forest | 100 trees"),
            (RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE), "Random forest | 500 trees"),
    ):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf, name, X_train, X_dev, y_train, y_dev, predY_dev))
    # Disabled SGD variants kept for reference (no-op string literal).
    '''
    for penalty in ["L1", "L2"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train SGD model
        results.append(benchmark(SGDClassifier(loss="hinge", alpha=1e-4, max_iter=1000,
            penalty=penalty, random_state=RANDOM_STATE), "SGD classifier | hinge loss, %s penalty, alpha=1e-4" %penalty,X_train, X_dev, y_train, y_dev, predY_dev))
        results.append(benchmark(SGDClassifier(loss="hinge", alpha=1e-6, max_iter=1000,
            penalty=penalty, random_state=RANDOM_STATE), "SGD classifier | hinge loss %s penalty, alpha=1e-6" %penalty,X_train, X_dev, y_train, y_dev, predY_dev))
        results.append(benchmark(SGDClassifier(loss="log", alpha=.0001, max_iter=1000,
            penalty=penalty, random_state=RANDOM_STATE), "SGD classifier | log loss %s penalty" %penalty,X_train, X_dev, y_train, y_dev, predY_dev))
    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=10000,
        penalty="elasticnet", random_state=RANDOM_STATE),"SGD classifier | Elastic-Net penalty",X_train, X_dev, y_train, y_dev, predY_dev))
    '''
    return results
def comparison_plots(results):
    """
    results - list of per-classifier tuples from benchmark():
              (name, accuracy, f1_macro, auc, train_time, test_time)
    yields: one annotated horizontal bar plot per metric (accuracy score,
            macro-averaged F1 score, ROC AUC) across all classifiers
    """
    # Plotting logistics
    indices = np.arange(len(results))
    # Transpose the list of row-tuples into per-metric column lists.
    clf_names, acc_score, f1_macro, auc_score, training_time, test_time = \
        [list(col) for col in zip(*results)]
    data_tuples = list(zip(clf_names, acc_score, f1_macro, auc_score))
    dataframe_to_plot = pd.DataFrame(data_tuples,
                                     columns=['Classifier', 'Accuracy Score',
                                              'F1 Macro-Avg Score', 'AUC of ROC curve'])
    sns.set(style="whitegrid")
    qualitative_colors = sns.color_palette("Set2", 10)
    sns.set_palette(qualitative_colors)
    sns.set(rc={'figure.figsize':(12,16)})

    def _annotated_barplot(metric_col, scores):
        # One horizontal bar per classifier, annotated with its name and a
        # percentage score (the original repeated this section three times).
        ax = sns.barplot(x=metric_col, y='Classifier', data=dataframe_to_plot,
                         palette=qualitative_colors)
        ax.set_yticklabels('')
        for i, pos in enumerate(indices):
            ax.annotate(clf_names[i], (0.02, pos+0.2))
        for i, pos in enumerate(indices):
            ax.annotate(str(round(scores[i]*100,2))+'%', (scores[i]-0.07, pos+0.2))
        plt.show()

    _annotated_barplot('Accuracy Score', acc_score)
    _annotated_barplot('F1 Macro-Avg Score', f1_macro)
    _annotated_barplot('AUC of ROC curve', auc_score)
# options
print_top10 = True
n_features = 2 ** 16
filtered = True
target_names = ['pos', 'neg']
# Display names consumed by classification_report inside benchmark().
# NOTE(review): 'Postive' typo is user-facing only (label text).
categories = ['Postive', 'Negative']
# Benchmark every classifier on the dev split, then plot the comparison.
results1 = create_results(X_train, X_dev, y_train, y_dev, predY_dev)
comparison_plots(results1)
def compare_trees(X_train, X_test, y_train, y_test):
    """
    X_train - training set data features
    X_test - validation set data features
    y_train - training set data labels
    y_test - validation set truth
    compare_trees: fit tree-based models (random-trees embedding, random
    forest, gradient boosting) alone and as feature transformers feeding a
    Logistic Regression, and plot comparable ROC curves for them
    yields: a full ROC plot plus a version zoomed in at the top left
    """
    # Half of the training data fits the tree models; the other half fits the
    # downstream logistic regressions, so the LR is not fit on rows the trees
    # already memorised. (Removed: unused ``n_estimator = 40``.)
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(n_estimators=40, max_depth=1, random_state=RANDOM_STATE)
    rt_lm = LogisticRegression()
    pipeline = make_pipeline(rt, rt_lm)
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
    fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
    # Supervised transformation based on random forests: one-hot encode the
    # leaf index each sample lands in, then fit LR on that encoding.
    rf = RandomForestClassifier(max_depth=3, n_estimators=200, random_state=RANDOM_STATE)
    rf_enc = OneHotEncoder()
    rf_lm = LogisticRegression()
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
    # Same idea with gradient-boosted trees (GBT apply() returns an extra axis).
    grd = GradientBoostingClassifier(n_estimators=40, learning_rate=1.0, max_depth=1, random_state=RANDOM_STATE)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    y_pred_grd_lm = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
    # The gradient boosted model by itself
    y_pred_grd = grd.predict_proba(np.asarray(X_test))[:, 1]
    fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
    # The random forest model by itself
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    curves = [(fpr_rt_lm, tpr_rt_lm, 'RT + LR'),
              (fpr_rf, tpr_rf, 'RF'),
              (fpr_rf_lm, tpr_rf_lm, 'RF + LR'),
              (fpr_grd, tpr_grd, 'GBT'),
              (fpr_grd_lm, tpr_grd_lm, 'GBT + LR')]

    def _plot_roc(title, xlim=None, ylim=None):
        # Shared ROC plotting (the original duplicated this whole section for
        # the zoomed view; it also called plt.figure(1) right before
        # plt.figure(figsize=...), leaving a spurious empty figure — dropped).
        plt.figure(figsize=(10, 8))
        if xlim is not None:
            plt.xlim(*xlim)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.plot([0, 1], [0, 1], 'k--')   # chance diagonal
        for fpr, tpr, label in curves:
            plt.plot(fpr, tpr,
                     label='{0} (area = {1:0.3f})'.format(label, auc(fpr, tpr)))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title(title)
        plt.legend(loc='best')
        plt.show()

    _plot_roc('ROC curve')
    _plot_roc('ROC curve (zoomed in at top left)', xlim=(0, 0.4), ylim=(0.6, 1))
def parameter_tuning_plot(clf_scores, params, measure, title):
    """
    clf_scores - 2-D array of CV scores, one row per parameter value
    params - list of parameters that were used in the tuning
    measure - text used for the xlabel
    title - text for the title of the plot
    parameter_tuning_plot: plot the CV tuning scores (mean +/- 2 std) as
    scattered points with error bars, one x position per parameter
    yields: a line/scatter plot with error bars
    """
    index = range(1, len(params)+1)
    plt.subplots(figsize=(10, 8))
    # Per-parameter mean and spread across CV folds.
    # (Removed: unused ``score_medians``.)
    score_means = np.mean(clf_scores, axis=1)
    score_std = np.std(clf_scores, axis=1)
    plt.scatter(index, score_means, c='g', zorder=3, s=100, marker='o', label= 'Mean of ROC AUC scores')
    plt.errorbar(index, score_means, yerr = 2*score_std, color='#fcba03', alpha =0.7, capsize=10, elinewidth=4, linestyle="None", zorder = 1, label= 'SE of ROC AUC scores')
    plt.title(title)
    plt.ylabel('ROC AUC scores')
    plt.xlabel(measure)
    plt.xticks(index, params, rotation=90)
    # Single legend call (the original created a first legend and immediately
    # replaced it with this styled one).
    legend = plt.legend(loc='lower center', frameon=True, framealpha=0.6, numpoints=1, scatterpoints=1)
    rect = legend.get_frame()
    rect.set_facecolor('#D3D3D3')
    rect.set_linewidth(0.6)
    plt.gca().grid(which='major', axis='y')
    # Bug fix: the original ended with ``plt.show`` — a bare attribute access
    # that never calls the function, so the figure was never rendered.
    plt.show()
# Tune DecisionTree max_depth via 5-fold cross-validated ROC AUC.
Params = [ 1.0, 2.0, 5.0, 7.0, 10.0, 15.0, 100.]
N_param = len(Params)
cv = 5
clf_scores = np.zeros((N_param,cv))
cols = 0
for param in Params:
    # NOTE(review): max_depth is fed floats; recent sklearn requires an int.
    clf = DecisionTreeClassifier(max_depth=param, random_state=RANDOM_STATE)
    clf_scores[cols,:] = cross_val_score(clf, X, y, cv=cv, scoring = 'roc_auc' , n_jobs = 8)
    cols += 1
# NOTE(review): title says "penalty parameter" but the sweep is over max depth.
parameter_tuning_plot(clf_scores, Params, 'Max depth', 'ROC AUC scores by penalty parameter')
# Tune the number of Bagging estimators (40% max features) by CV ROC AUC.
Params = [20, 40, 50, 100, 200, 500]
N_param = len(Params)
cv = 5
clf_scores = np.zeros((N_param,cv))
cols = 0
for param in Params:
    clf = BaggingClassifier(n_estimators=param, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
    clf_scores[cols,:] = cross_val_score(clf, X, y, cv=cv, scoring = 'roc_auc' )
    cols += 1
# NOTE(review): the xlabel/title text does not describe the swept quantity
# (number of estimators).
parameter_tuning_plot(clf_scores, Params, '40 trees 40% max features', 'ROC AUC scores by penalty parameter')
# Tune the number of GradientBoosting estimators (lr=0.7, max_depth=10).
Params = [10, 20, 50, 100, 200]
N_param = len(Params)
cv = 5
clf_scores = np.zeros((N_param,cv))
cols = 0
for param in Params:
    clf = GradientBoostingClassifier(n_estimators=param, learning_rate=0.7, max_depth=10, random_state=RANDOM_STATE)
    clf_scores[cols,:] = cross_val_score(clf, X, y, cv=cv, scoring = 'roc_auc' )
    cols += 1
parameter_tuning_plot(clf_scores, Params, 'Number of trees, learning rate 0.7, max-depth=10', 'ROC AUC scores by penalty parameter')
# Tune the number of RandomForest trees by 5-fold CV ROC AUC.
Params = [50, 100, 300, 500, 1000]
N_param = len(Params)
cv = 5
clf_scores = np.zeros((N_param,cv))
cols = 0
for param in Params:
    # NOTE(review): max_features='auto' is removed in sklearn >= 1.3;
    # use 'sqrt' when upgrading.
    clf = RandomForestClassifier(n_estimators=param, max_features='auto', random_state=RANDOM_STATE,n_jobs=8)
    clf_scores[cols,:] = cross_val_score(clf, X, y, cv=cv, scoring = 'roc_auc' , n_jobs = 8)
    cols += 1
parameter_tuning_plot(clf_scores, Params, 'Number of trees in the Random Forest', 'ROC AUC scores by choice of number of trees' )
# Tune k for k-nearest-neighbours by 5-fold cross-validated ROC AUC.
Params = [20, 30, 40, 50, 60]
N_param = len(Params)
cv = 5
clf_scores = np.zeros((N_param, cv))
cols = 0
for param in Params:
    # Bug fix: the original hard-coded n_neighbors=20 inside the loop, so
    # every row scored the identical model and the "tuning" curve was flat;
    # the loop parameter must be used.
    clf = KNeighborsClassifier(n_neighbors=param)
    clf_scores[cols, :] = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc', n_jobs=8)
    cols += 1
# Labels corrected too: the originals ("Regularisation Parameter") were
# copy-pasted from the logistic-regression cell below.
parameter_tuning_plot(clf_scores, Params, 'Number of neighbors (k)', 'ROC AUC score by number of neighbors')
# Tune the inverse regularisation strength C of an L1 logistic regression.
Params = [1e-2,1e-1, 1.0, 1e1, 1e2, 1e3]
N_param = len(Params)
cv = 5
clf_scores = np.zeros((N_param,cv))
cols = 0
for param in Params:
    # liblinear solver supports the L1 penalty.
    clf = LogisticRegression(penalty ='l1', C=param, solver='liblinear', random_state=RANDOM_STATE)
    clf_scores[cols,:] = cross_val_score(clf, X, y, cv=cv, scoring = 'roc_auc' , n_jobs = 8)
    cols += 1
parameter_tuning_plot(clf_scores, Params, 'Regularisation Parameter', 'ROC AUC score by degree of regularisation')
# Benchmark the classifiers, one at a time
def benchmark(clf, name, X_train, X_dev, y_train, y_dev, predY_dev):
    """
    clf - the classifier
    name - its name
    benchmark: to create the benchmark metrics for the classification
    (this redefinition replaces the earlier benchmark(); it additionally
    reports coefficient dimensionality/density for linear models)
    returns: the inputs to the results list:
    (name, accuracy, macro-F1, ROC AUC, train time, test time)
    """
    eps = 1e-6  # smoothing constant for the log-ratio (logit) transform
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    pred = clf.predict(X_dev)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    acc_score = metrics.accuracy_score(y_dev, pred)
    print("accuracy: %0.3f" % acc_score)
    f1_macro_score = metrics.f1_score(y_dev, pred, average='macro')
    print("*** F1 macro avg score: %0.3f" % f1_macro_score)
    # Probability of the positive class, for AUC and the regression scores.
    y_score = clf.predict_proba(X_dev)[:,1]
    auc_score = metrics.roc_auc_score(y_dev, y_score)
    print("*** AUC for ROC = %0.3f\n" % auc_score)
    # Linear models expose coef_; report its size and sparsity.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
    print("classification report:")
    print(metrics.classification_report(y_dev, pred,
                                        target_names=categories))
    conf_mat = metrics.confusion_matrix(y_dev, pred)
    print("confusion matrix:")
    print(conf_mat)
    # Confusion-matrix-derived rates, as percentages.
    tn, fp, fn, tp = conf_mat.ravel()
    sensitivity = tp / (tp+fn) *100.
    print("\nsensitivity / recall (TPR):")
    print(np.round(sensitivity,3))
    specificity = tn / (tn+fp) *100.
    print("specificity (TNR):")
    print(np.round(specificity,3))
    CCR = ( sensitivity + specificity) / 2
    print("Correct classification rate (CCR); balanced accuracy:")
    print(np.round(CCR,3))
    # Regression-style agreement between predicted probabilities and the
    # continuous %F target, raw and after the log-ratio transform.
    R2 = metrics.r2_score(predY_dev,y_score)
    print("R-squared value %F:")
    print(np.round(R2,3))
    mae = metrics.mean_absolute_error(predY_dev,y_score)
    print("MAE value %F:")
    print(np.round(mae,3))
    R2 = metrics.r2_score(log_ratio(predY_dev, eps), log_ratio(y_score, eps))
    print("R-squared value logK(%F):")
    print(np.round(R2,3))
    mae = metrics.mean_absolute_error(log_ratio(predY_dev, eps), log_ratio(y_score, eps))
    print("MAE value logK(%F):")
    print(np.round(mae,3))
    print()
    clf_descr = str(clf).split('(')[0]  # NOTE(review): unused
    return name, acc_score, f1_macro_score, auc_score, train_time, test_time
# Final held-out test-set evaluation of the selected models.
results_test = []
dtc = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
# NOTE(review): display label says "2 trees" but this is a single decision
# tree with max_depth=5.
results_test.append(benchmark(dtc, 'Decision Tree classifier with 2 trees', X_train, X_test, y_train, y_test, predY_test))
bc = BaggingClassifier(n_estimators=500, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
results_test.append(benchmark(bc, "Bagging | 500 trees 40% max features", X_train, X_test, y_train, y_test, predY_test))
rf = RandomForestClassifier(n_estimators=1000, max_features='auto', random_state=RANDOM_STATE)
results_test.append(benchmark(rf, "Random Forest | 1000 trees", X_train, X_test, y_train, y_test, predY_test))
comparison_plots(results_test)
def class_scores(y_dev, pred):
    """
    Print classification metrics for a set of predicted scores.

    y_dev: True labels or binary label indicators
    pred: Target scores. In the binary and multilabel cases, these can be either
    probability estimates or non-thresholded decision values (as returned by
    decision_function on some classifiers). In the multiclass case, these must
    be probability estimates which sum to 1.
    """
    eps = 1e-6
    print('_' * 80)
    # Threshold the scores at 0.5 to obtain hard class assignments.
    hard_labels = (pred >= 0.5).astype(bool)
    print("accuracy: %0.3f" % metrics.accuracy_score(y_dev, hard_labels))
    print("*** F1 macro avg score: %0.3f" % metrics.f1_score(y_dev, hard_labels, average='macro'))
    # AUC uses the raw scores, not the thresholded labels.
    print("*** AUC for ROC = %0.3f\n" % metrics.roc_auc_score(y_dev, pred))
    print("classification report:")
    print(metrics.classification_report(y_dev, hard_labels,
                                        target_names=categories))
    cm = metrics.confusion_matrix(y_dev, hard_labels)
    print("confusion matrix:", cm, sep="\n")
    # Confusion-matrix-derived rates, as percentages.
    tn, fp, fn, tp = cm.ravel()
    tpr = tp / (tp + fn) * 100.
    tnr = tn / (tn + fp) * 100.
    print("\nsensitivity / recall (TPR):", np.round(tpr, 3), sep="\n")
    print("specificity (TNR):", np.round(tnr, 3), sep="\n")
    print("Correct classification rate (CCR):", np.round((tpr + tnr) / 2, 3), sep="\n")
    # Regression-style agreement between truth and scores, raw and after the
    # smoothed log-ratio transform.
    print("R-squared value %F:", np.round(metrics.r2_score(y_dev, pred), 3), sep="\n")
    print("MAE value %F:", np.round(metrics.mean_absolute_error(y_dev, pred), 3), sep="\n")
    log_true = log_ratio(y_dev, eps)
    log_pred = log_ratio(pred, eps)
    print("R-squared value logK(%F):", np.round(metrics.r2_score(log_true, log_pred), 3), sep="\n")
    print("MAE value logK(%F):", np.round(metrics.mean_absolute_error(log_true, log_pred), 3), sep="\n")
from sklearn.metrics import balanced_accuracy_score
from imblearn.ensemble import RUSBoostClassifier
# RUSBoost: boosting combined with random under-sampling to counter class imbalance.
rusboost = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R',
                              random_state=RANDOM_STATE)
rusboost.fit(X_train, y_train)
y_pred = rusboost.predict(X_test)
# NOTE(review): class_scores documents probability scores, but predict()
# returns hard 0/1 labels, so the AUC/R2 here are computed on labels.
class_scores(y_test, y_pred)
from imblearn.ensemble import EasyEnsembleClassifier
# EasyEnsemble: bag of AdaBoost learners, each trained on a balanced resample.
easy = EasyEnsembleClassifier(n_estimators=200, random_state=RANDOM_STATE)
easy.fit(X_train, y_train)
y_pred = easy.predict(X_test)
class_scores(y_test, y_pred)
from imblearn.metrics import sensitivity_score, specificity_score
# Balanced accuracy: mean of sensitivity and specificity.
CCR = lambda y, y_pred: (sensitivity_score(y, y_pred) + specificity_score(y, y_pred)) / 2.
def plot_gridsearchCV(results, scoring, param='param_min_samples_split'):
    """Plot train/test GridSearchCV curves for every scorer against one parameter.

    results - cv_results_ mapping from a fitted GridSearchCV
    scoring - dict of scorer name -> scorer (only the sorted names are used)
    param - key in cv_results_ holding the swept parameter values
    """
    plt.figure(figsize=(13, 13))
    plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
              fontsize=16)
    plt.xlabel(param)
    plt.ylabel("Score")
    ax = plt.gca()
    # NOTE(review): axis limits are hard-coded for an n_estimators sweep up
    # to 1000; other parameters will plot poorly.
    ax.set_xlim(0,1010)
    ax.set_ylim(0.50, 1)
    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results[param].data, dtype=float)
    for scorer, color in zip(sorted(scoring), ['g', 'k', 'r', 'b']):
        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            # Shade +/- 1 std around the mean (visible for test curves only).
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))
        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]
        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)
        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))
    plt.legend(loc="best")
    plt.grid(False)
    plt.show()
# --- Calibrated forest / bagging grid searches over n_estimators (mol2vec X) ---
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from imblearn.metrics import sensitivity_score, specificity_score
from sklearn.metrics import r2_score, mean_absolute_error
# Balanced accuracy (mean of sensitivity and specificity); redefined from the
# identical lambda above.
CCR = lambda y, y_pred: (sensitivity_score(y, y_pred) + specificity_score(y, y_pred)) / 2.
# The scorers can be either be one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'AUC': 'roc_auc', 'Sensitivity': make_scorer(sensitivity_score), 'Specificity': make_scorer(specificity_score), 'CCR': make_scorer(CCR)}
# Probability calibration wrapped around a random forest; the grid tunes the
# inner estimator via the base_estimator__ parameter prefix.
calibrated_forest = CalibratedClassifierCV(
base_estimator=RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE))
param_grid = {
'base_estimator__n_estimators': [ 10, 20, 50, 100, 200, 500, 1000]}
search = GridSearchCV(calibrated_forest, param_grid, cv=5, scoring=scoring, refit='AUC', return_train_score=True)
search.fit(X, y)
results = search.cv_results_
plot_gridsearchCV(results, scoring, 'param_base_estimator__n_estimators')
# Same search with a bagging ensemble as the calibrated base estimator.
calibrated_forest = CalibratedClassifierCV(
base_estimator=BaggingClassifier(max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE))
param_grid = {
'base_estimator__n_estimators': [ 10, 20, 50, 100, 200, 500, 1000]}
search = GridSearchCV(calibrated_forest, param_grid, cv=5, scoring=scoring, refit='AUC', return_train_score=True)
search.fit(X, y)
results = search.cv_results_
plot_gridsearchCV(results, scoring, 'param_base_estimator__n_estimators')
# Compute the 8 QED property descriptors per molecule and expand into columns.
descriptors = df.apply(lambda x: QED.properties(x['ROMol']), axis=1).apply(pd.Series)
descriptors.columns = ['MW', 'ALOGP', 'HBA', 'HBD', 'PSA', 'ROTB', 'AROM', 'ALERTS']
descriptors.head()
# Compute the correlation matrix
corr = descriptors.corr()
# Generate a mask for the upper triangle.
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin bool is the documented replacement.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 16))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
# Build the combined feature table: descriptors first, then the remaining
# (non-metadata) columns of df; PercentF/Class are kept aside as targets.
cols_to_drop = ['ID', 'Name', 'Smiles', 'ROMol', 'QED', 'mol-sentence']
predY = df['PercentF'].astype('float32')
classY = df['Class']
X_df = df.drop(cols_to_drop, axis=1)
X_df = pd.concat([descriptors, X_df], axis=1)
X_df.head()
# X_ appends the 8 descriptors AFTER the mol2vec vectors (note: the opposite
# column order from X_df above).
X_ = np.concatenate((X, descriptors.values), axis=1)
X_.shape
# --- Grid searches on X_ (mol2vec vectors + 8 QED descriptors) ---
calibrated_forest = CalibratedClassifierCV(
base_estimator=RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE))
param_grid = {
'base_estimator__n_estimators': [ 10, 20, 50, 100, 200, 500, 1000]}
search = GridSearchCV(calibrated_forest, param_grid, cv=5, scoring=scoring, refit='AUC', return_train_score=True)
search.fit(X_, y)
results = search.cv_results_
plot_gridsearchCV(results, scoring, 'param_base_estimator__n_estimators')
# Same combined features, calibrated bagging ensemble.
calibrated_forest = CalibratedClassifierCV(
base_estimator=BaggingClassifier(max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE))
param_grid = {
'base_estimator__n_estimators': [ 10, 20, 50, 100, 200, 500, 1000]}
search = GridSearchCV(calibrated_forest, param_grid, cv=5, scoring=scoring, refit='AUC', return_train_score=True)
search.fit(X_, y)
results = search.cv_results_
plot_gridsearchCV(results, scoring, 'param_base_estimator__n_estimators')
# Baseline: the 8 QED descriptors alone as features.
calibrated_forest = CalibratedClassifierCV(
base_estimator=RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE))
param_grid = {
'base_estimator__n_estimators': [ 10, 20, 50, 100, 200, 500, 1000]}
search = GridSearchCV(calibrated_forest, param_grid, cv=5, scoring=scoring, refit='AUC', return_train_score=True)
search.fit(descriptors.values, y)
results = search.cv_results_
plot_gridsearchCV(results, scoring, 'param_base_estimator__n_estimators')
# Univariate feature selection + the calibrated model in one pipeline; the
# grid tunes both k and the inner tree count.
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
pipe = Pipeline([
('select', SelectKBest()),
('model', calibrated_forest)])
param_grid = {
'select__k': [50, 100, 300],
'model__base_estimator__n_estimators': [50, 100, 200]}
search = GridSearchCV(pipe, param_grid, cv=5, scoring=scoring, refit='AUC', return_train_score=True)
search.fit(X, y)
results = search.cv_results_
results
import xgboost as xgb
def rmse(predictions, targets):
return np.sqrt(((predictions - targets) ** 2).mean())
# --- Gradient boosting (XGBoost) on the mol2vec split ---
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)
#bst = xgb.train(param, dtrain, num_round)
# make prediction
#y_predicted = bst.predict(dtest)
# Cross-validate to pick the boosting-round count (early stopping, 5 folds).
res = xgb.cv(param, dtrain, num_boost_round=1000, nfold=5, seed=RANDOM_STATE, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
print(np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))
gbdt = xgb.train(param, dtrain, best_nrounds)
# NOTE(review): with 'binary:logistic' predict() returns probabilities, not
# 0/1 labels -- confirm class_scores thresholds them before scoring.
y_predicted = gbdt.predict(dtest)
class_scores(y_test, y_predicted)
# Setting refit='AUC', refits an estimator on the whole dataset with the
# parameter setting that has the best cross-validated AUC score.
# That estimator is made available at ``gs.best_estimator_`` along with
# parameters like ``gs.best_score_``, ``gs.best_params_`` and
# ``gs.best_index_``
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
param_grid={'min_samples_split': range(2, 403, 10)},
scoring=scoring, refit='AUC', return_train_score=True)
gs.fit(X, y)
results = gs.cv_results_
plot_gridsearchCV(results, scoring)
def input_test_data(in_file_pos, in_file_neg, out_file):
    """Load positive/negative comma-separated SMILES files, featurize them,
    and return (X, df, classY, df_input):

    X        -- feature matrix (8 QED descriptors followed by mol2vec dims)
    df       -- the corresponding feature dataframe
    classY   -- 0/1 class labels (1 = positive file)
    df_input -- the raw combined input dataframe before featurization
    """
    def _read(path):
        # Comma-separated; first three columns are ID, Name, Smiles.
        return pd.read_csv(path, delimiter=',', usecols=[0, 1, 2],
                           names=['ID', 'Name', 'Smiles'], encoding='latin-1')
    df_pos = _read(in_file_pos)
    df_neg = _read(in_file_neg)
    df_neg['Class'] = 0
    print('Negative dataframe columns')
    print(df_neg.columns)
    df_pos['Class'] = 1
    print('Postive dataframe columns')
    print(df_pos.columns)
    df = pd.concat([df_neg, df_pos])
    df.reset_index(drop=True, inplace=True)
    # Keep a handle on the raw combined table before featurization.
    df_input = df
    model_path = './models/model_300dim.pkl'
    X, df = featurize(df, out_file, model_path, 2, uncommon='UNK')
    # Expand the per-molecule QED property tuple into eight named columns.
    descriptors = df.apply(lambda row: QED.properties(row['ROMol']), axis=1).apply(pd.Series)
    descriptors.columns = ['MW', 'ALOGP', 'HBA', 'HBD', 'PSA', 'ROTB', 'AROM', 'ALERTS']
    classY = df['Class']
    remaining = df.drop(['ID', 'Name', 'Smiles', 'Class', 'ROMol', 'QED', 'mol-sentence'], axis=1)
    df = pd.concat([descriptors, remaining], axis=1)
    return df.values, df, classY, df_input
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.metrics import plot_confusion_matrix
import umap.umap_ as UMAP
def plot_scatter(test_X, test_classY):
    """Project test_X to 2-D with PCA and UMAP (on standardized features) and
    scatter-plot the UMAP embedding coloured by class.

    test_X      -- 2-D feature matrix
    test_classY -- per-row 0/1 labels (anything with .tolist())
    Returns a dataframe with the PCA/UMAP coordinates and a readable Class label.
    """
    scaler = StandardScaler()
    pca_model = PCA(n_components=2)
    umap_model = UMAP.UMAP()
    pca = pca_model.fit_transform(scaler.fit_transform(test_X))
    umap = umap_model.fit_transform(scaler.fit_transform(test_X))
    df_vec = pd.DataFrame()
    df_vec['PCA-c1'] = pca.T[0]
    df_vec['PCA-c2'] = pca.T[1]
    df_vec['UMAP-c1'] = umap.T[0]
    df_vec['UMAP-c2'] = umap.T[1]
    df_vec['Class'] = ['Positive dataset' if x == 1
                       else 'Negative dataset' for x in test_classY.tolist()]
    f, ax = plt.subplots(figsize=(8, 8))
    # BUG FIX: pass x/y as keywords -- seaborn >= 0.12 made them keyword-only,
    # so the old positional call would bind 'UMAP-c1' to `data` and fail.
    # NOTE(review): `palette` is a global defined elsewhere in the notebook --
    # confirm it exists at call time.
    sns.scatterplot(x='UMAP-c1', y='UMAP-c2', hue='Class', data=df_vec,
                    legend='full', palette=palette)
    plt.show()
    return df_vec
def run_classifier (clf, X_train, X_dev, y_train, y_test):
    """Fit clf on (X_train, y_train) and predict on X_dev.

    Returns (pred, prob, df_vec): hard predictions, positive-class
    probabilities, and a dataframe pairing each true label (as readable text)
    with its prediction and probability.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    start = time()
    clf.fit(X_train, y_train)
    train_time = time() - start
    print("train time: %0.3fs" % train_time)
    pred = clf.predict(X_dev)
    # Column 1 of predict_proba is the positive-class probability.
    prob = clf.predict_proba(X_dev)[:, 1]
    labels = ['Positive dataset' if v == 1 else 'Negative dataset'
              for v in y_test.tolist()]
    df_vec = pd.DataFrame({'Class': labels,
                           'Prediction': pred,
                           'Probability': prob})
    return pred, prob, df_vec
def plot_violin(test_classY, pred, prob):
    """Violin plot of predicted positive-class probability, split by true class."""
    fig, axis = plt.subplots(figsize=(8, 8))
    frame = pd.DataFrame({
        'Class': ['Positive dataset' if lbl == 1 else 'Negative dataset'
                  for lbl in test_classY.tolist()],
        'Prediction': pred,
        'Probability': prob,
    })
    sns.violinplot(x="Class", y="Probability", data=frame, palette="muted")
    plt.show()
def plot_stripplot(test_classY, pred, prob):
    """Jittered strip plot of predicted probability, split by true class."""
    fig, axis = plt.subplots(figsize=(8, 8))
    frame = pd.DataFrame({
        'Class': ['Positive dataset' if lbl == 1 else 'Negative dataset'
                  for lbl in test_classY.tolist()],
        'Prediction': pred,
        'Probability': prob,
    })
    sns.stripplot(x="Class", y="Probability", data=frame, palette="muted", jitter=0.10)
    plt.show()
# Two candidate classifiers; the second assignment replaces the first, so only
# the random forest is actually evaluated below.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_depth=10, max_features='auto', random_state=RANDOM_STATE)
df_scat = plot_scatter(X_test, y_test)
pred, prob, df_vec = run_classifier (clf, X_train, X_test, y_train, y_test)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
# Build an external test set from positive/negative SMILES files.
in_file_pos = './BA-Chembl-4-phases-smiles-no-repeat.csv'
in_file_neg = './hsbd-smiles-no-repeat.csv'
out_file = 'BA-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# Drop the 8 leading QED descriptor columns so test_X matches the mol2vec-only X.
test_X = test_X[:,8:]
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
df_scat = plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
def input_single_test_data(in_file, out_file):
    """Load one comma-separated Name,Smiles file (no labels), featurize it,
    and return (X, df, df_input): feature matrix (8 QED descriptors followed
    by mol2vec dims), feature dataframe, and the raw input dataframe."""
    # Comma-separated; first two columns are Name and Smiles.
    df_input = pd.read_csv(in_file, delimiter=',', usecols=[0, 1],
                           names=['Name', 'Smiles'], encoding='latin-1')
    model_path = './models/model_300dim.pkl'
    X, df = featurize(df_input, out_file, model_path, 2, uncommon='UNK')
    # Expand the QED property tuple into eight named descriptor columns.
    descriptors = df.apply(lambda row: QED.properties(row['ROMol']), axis=1).apply(pd.Series)
    descriptors.columns = ['MW', 'ALOGP', 'HBA', 'HBD', 'PSA', 'ROTB', 'AROM', 'ALERTS']
    remaining = df.drop(['Name', 'Smiles', 'ROMol', 'QED', 'mol-sentence'], axis=1)
    df = pd.concat([descriptors, remaining], axis=1)
    return df.values, df, df_input
# Featurize the unlabelled curated set for prediction.
in_file = './All_curated.csv'
out_file = 'All_curated-vectors.csv'
test_X, test_df, df_input = input_single_test_data(in_file, out_file)
# Drop the 8 leading QED descriptor columns, keeping only the mol2vec dims.
test_X = test_X[:,8:]
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.metrics import plot_confusion_matrix
import umap.umap_ as UMAP
def plot_single_scatter(test_X):
    """Project test_X to 2-D with PCA and UMAP (on standardized features) and
    scatter-plot the UMAP embedding (no class colouring).

    Returns a dataframe with the PCA and UMAP coordinates.
    """
    scaler = StandardScaler()
    pca_model = PCA(n_components=2)
    umap_model = UMAP.UMAP()
    pca = pca_model.fit_transform(scaler.fit_transform(test_X))
    umap = umap_model.fit_transform(scaler.fit_transform(test_X))
    df_vec = pd.DataFrame()
    df_vec['PCA-c1'] = pca.T[0]
    df_vec['PCA-c2'] = pca.T[1]
    df_vec['UMAP-c1'] = umap.T[0]
    df_vec['UMAP-c2'] = umap.T[1]
    f, ax = plt.subplots(figsize=(8, 8))
    # BUG FIX: pass x/y as keywords -- seaborn >= 0.12 made them keyword-only,
    # so the old positional call would bind 'UMAP-c1' to `data` and fail.
    # NOTE(review): `palette` has no effect without `hue`, and the global
    # `palette` is defined elsewhere in the notebook -- confirm.
    sns.scatterplot(x='UMAP-c1', y='UMAP-c2', data=df_vec,
                    legend='full', palette=palette)
    plt.show()
    return df_vec
def run_single_classifier (clf, X_train, X_dev, y_train):
    """Fit clf on (X_train, y_train) and predict on unlabelled X_dev.

    Returns (pred, prob, df_vec); unlike run_classifier, the 'Class' column
    is derived from the predictions themselves (no true labels available).
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    start = time()
    clf.fit(X_train, y_train)
    train_time = time() - start
    print("train time: %0.3fs" % train_time)
    pred = clf.predict(X_dev)
    # Column 1 of predict_proba is the positive-class probability.
    prob = clf.predict_proba(X_dev)[:, 1]
    labels = ['Positive dataset' if v == 1 else 'Negative dataset'
              for v in pred.tolist()]
    df_vec = pd.DataFrame({'Class': labels,
                           'Prediction': pred,
                           'Probability': prob})
    return pred, prob, df_vec
# --- Predict on the unlabelled curated set and export probabilities ---
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
df_scat = plot_single_scatter(test_X)
# Train on the full labelled set (X, y); predict on the unlabelled test set.
pred, prob, df_vec = run_single_classifier (clf, X, test_X, y)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
df_scat.columns
# Attach identifiers and UMAP coordinates to the per-molecule predictions.
# NOTE(review): input_single_test_data reads only ['Name', 'Smiles'] columns,
# so df_input['ID'] looks like it would raise KeyError here -- confirm where
# the 'ID' column is expected to come from.
df_vec['ID'] = df_input['ID']
df_vec['Name'] = df_input['Name']
df_vec['Smiles'] = df_input['Smiles']
df_vec['x-coords'] = df_scat['UMAP-c1']
df_vec['y-coords'] = df_scat['UMAP-c2']
df_vec.head(15)
df_vec.to_csv('probabilities_BA_prediction_curated.csv',index=False)
# --- Curated BA / non-BA labelled test set: featurize, predict, export ---
in_file_pos = './BA-curated-descriptors.csv'
in_file_neg = './non-BA-curated-descriptors.csv'
out_file = 'BA-curated-descriptors-vectors.csv'
# Correct four-value unpack of input_test_data's return.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# Drop the 8 leading QED descriptor columns to match the mol2vec-only X.
test_X = test_X[:,8:]
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
df_scat = plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
df_scat.columns
# Attach identifiers and UMAP coordinates, then export.
df_vec['ID'] = df_input['ID']
df_vec['Name'] = df_input['Name']
df_vec['Smiles'] = df_input['Smiles']
df_vec['x-coords'] = df_scat['UMAP-c1']
df_vec['y-coords'] = df_scat['UMAP-c2']
df_vec.head(15)
df_vec.to_csv('probabilities_BA_prediction_curated.csv',index=False)
# --- Evaluate the forest (trained on mol2vec X) against ChEMBL phase-0 negatives ---
in_file_pos = './BA-Chembl-4-phases-smiles-no-repeat.csv'
in_file_neg = './non-BA-Chembl-phase0-descriptors.csv'
out_file = 'BA-chembl-phases-descriptors-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# Drop the 8 leading QED descriptor columns so test_X matches the mol2vec-only X.
test_X = test_X[:,8:]
# The first clf is immediately replaced; only the random forest is used.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
# --- Evaluate the forest against QED/Ro5-filtered ChEMBL negatives ---
in_file_pos = './BA-Chembl-4-phases-smiles-no-repeat.csv'
in_file_neg = './non-BA-qed-ro5-Chembl-smiles.csv'
out_file = 'non-BA-qed-ro5-descriptors-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# Drop the 8 leading QED descriptor columns so test_X matches the mol2vec-only X.
test_X = test_X[:,8:]
# The first clf is immediately replaced; only the random forest is used.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
# --- Same evaluation using the combined features X_ (vectors + descriptors) ---
in_file_pos = './BA-Chembl-4-phases-smiles-no-repeat.csv'
in_file_neg = './hsbd-smiles-no-repeat.csv'
out_file = 'BA-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
# NOTE(review): X_ appends the descriptors AFTER the mol2vec vectors, while
# input_test_data puts the descriptors FIRST -- the train/test column order
# looks inconsistent; verify before trusting these scores.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# The first clf is immediately replaced; only the random forest is used.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X_, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
# --- Curated set evaluated with the combined features X_ ---
in_file_pos = './BA-curated-descriptors.csv'
in_file_neg = './non-BA-curated-descriptors.csv'
out_file = 'BA-curated-descriptors-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
# NOTE(review): column order of X_ (vectors then descriptors) vs. the
# returned test_X (descriptors first) looks inconsistent -- verify.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# The first clf is immediately replaced; only the random forest is used.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X_, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
# --- ChEMBL phase-0 negatives evaluated with the combined features X_ ---
in_file_pos = './BA-Chembl-4-phases-smiles-no-repeat.csv'
in_file_neg = './non-BA-Chembl-phase0-descriptors.csv'
out_file = 'BA-chembl-phases-descriptors-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
# NOTE(review): column order of X_ (vectors then descriptors) vs. the
# returned test_X (descriptors first) looks inconsistent -- verify.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# The first clf is immediately replaced; only the random forest is used.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X_, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
# --- QED/Ro5-filtered negatives evaluated with the combined features X_ ---
in_file_pos = './BA-Chembl-4-phases-smiles-no-repeat.csv'
in_file_neg = './non-BA-qed-ro5-Chembl-smiles.csv'
out_file = 'non-BA-qed-ro5-descriptors-vectors.csv'
# BUG FIX: input_test_data returns four values (X, df, classY, df_input);
# the original three-target unpack raised ValueError at runtime.
# NOTE(review): column order of X_ (vectors then descriptors) vs. the
# returned test_X (descriptors first) looks inconsistent -- verify.
test_X, test_df, test_classY, df_input = input_test_data(in_file_pos, in_file_neg, out_file)
# The first clf is immediately replaced; only the random forest is used.
clf = BaggingClassifier(n_estimators=50, max_samples=1.0, max_features=0.4, random_state=RANDOM_STATE)
clf = RandomForestClassifier(n_estimators=500, max_features='auto', random_state=RANDOM_STATE)
plot_scatter(test_X, test_classY)
pred, prob, df_vec = run_classifier (clf, X_, test_X, y, test_classY)
plt.subplots(figsize=(8, 8))
sns.violinplot(x="Class", y="Probability", data=df_vec, palette="muted")
plt.show()
plt.subplots(figsize=(8, 8))
sns.stripplot(x="Class", y="Probability", data=df_vec, palette="muted", jitter=0.10)
plt.show()
title = 'Confusion matrix with input classes'
disp = plot_confusion_matrix(clf, test_X, test_classY,
display_labels=[0,1],
cmap=plt.cm.Blues)
disp.ax_.set_title(title)
print()
print(disp.confusion_matrix)
plt.show()
# New split seed/size for the TensorFlow experiments below.
RANDOM_STATE = 458
TEST_SIZE = 0.2
# NOTE(review): `shuffle` is presumably sklearn.utils.shuffle imported earlier
# in the notebook -- confirm.
X_df_ = shuffle(X_df, random_state=RANDOM_STATE)
train, test = train_test_split(X_df_, test_size=TEST_SIZE, random_state=RANDOM_STATE)
# Split the held-out 20% evenly into validation and test halves.
test, val = train_test_split(test, test_size=0.5, random_state=RANDOM_STATE)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import feature_column
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched tf.data.Dataset of (feature-dict, label) pairs from a
    dataframe whose 'Class' column is the label; 'PercentF' is discarded."""
    frame = dataframe.copy()
    labels = frame.pop('Class')
    frame = frame.drop(['PercentF'], axis=1)
    ds = tf.data.Dataset.from_tensor_slices((dict(frame), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(frame))
    return ds.batch(batch_size)
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
# Grab one batch of features for demonstrating feature columns below.
example_batch = next(iter(train_ds))[0]
#for feature_batch, label_batch in train_ds.take(1):
# print('Every feature:', list(feature_batch.keys()))
# print('A batch of ages:', feature_batch['MW'])
# print('A batch of targets:', label_batch )
def demo(feature_column):
    """Densify one tf feature column against the module-level example_batch
    and print the resulting numpy array."""
    dense = layers.DenseFeatures(feature_column)
    print(dense(example_batch).numpy())
#MW = feature_column.numeric_column("AROM")
#demo(MW)
# Assemble numeric feature columns: the 8 QED descriptors plus every
# remaining (mol2vec) column of X_df.
feature_columns = []
# numeric cols
for header in ['MW', 'ALOGP', 'PSA']:
feature_columns.append(feature_column.numeric_column(header))
for header in ['HBA', 'HBD', 'ROTB', 'ALERTS']:
feature_columns.append(feature_column.numeric_column(header))
feature_columns.append(feature_column.numeric_column('AROM'))
for header in X_df.columns[10:].tolist():
feature_columns.append(feature_column.numeric_column(header))
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
# MLP over the feature-column layer; the final Dense(1) emits a logit,
# matched by from_logits=True in the loss below.
model = tf.keras.Sequential([
feature_layer,
layers.Dense(256, activation='relu'),
layers.Dense(128, activation='relu'),
layers.Dense(128, activation='relu'),
layers.Dense(1)
])
model.compile(optimizer='adam',
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy'])
model.fit(train_ds,
validation_data=val_ds,
epochs=100)
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)
# Second model: plain MLP on the 300-dim mol2vec matrix, sigmoid output
# (so the loss here uses probabilities, not logits).
model = keras.Sequential([
keras.layers.Flatten(input_shape=(300,)),
keras.layers.Dense(200, activation=tf.nn.relu),
keras.layers.Dense(128, activation=tf.nn.relu),
keras.layers.Dense(56, activation=tf.nn.relu),
keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
model.fit(X_train, y_train, epochs=200, batch_size=30)
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
df.columns
# --- Linear regression of PercentF on all features, with OLS diagnostics ---
lm = LinearRegression()
cols_to_drop = ['ID', 'Name', 'Smiles', 'PercentF', 'Class', 'ROMol', 'QED', 'mol-sentence']
Y = df['PercentF'].astype('float32')
X = df.drop(cols_to_drop, axis=1)
lmfit = lm.fit(X, Y)
residuals = Y - lm.predict(X)
print("RSS : ", np.sum(residuals**2))
print("MSE : ", np.mean(residuals**2))
#Residuals are almost same as statsmodel.OLS
from sklearn.feature_selection import RFE #Recursive(Backward) feature selection, takes all features and prunes out recursively.
rfe = RFE(estimator=lm, n_features_to_select=10, step=1)
rfe.fit(X, Y)
best_features = np.where(rfe.get_support())[0]
best_cols_names = [X.columns[i] for i in best_features]
best_cols_names
# Univariate feature selection with an F-test, for comparison with RFE.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
skb = SelectKBest(f_regression, k=10)
skb.fit(X, Y)
best_features = np.where(skb.get_support())[0]
best_cols_names = [X.columns[i] for i in best_features]
best_cols_names
# Statsmodels OLS for diagnostics (residual histogram, QQ plot, leverage).
X_c = sm.add_constant(X)
results = sm.OLS(Y,X_c).fit()
#Check if residuals are normally distributed
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.xlim(-30,30)
plt.grid(True)
plt.hist(results.resid)
plt.show()
fig = sm.qqplot(results.resid, fit=True, line='r')
plt.show()
# Residuals vs fitted values: look for heteroscedasticity / nonlinearity.
plt.plot(results.fittedvalues, results.resid,'o')
plt.hlines(xmin=np.min(results.fittedvalues),xmax=np.max(results.fittedvalues),y=0)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.show()
from statsmodels.graphics.regressionplots import plot_leverage_resid2
fig = plot_leverage_resid2(results)
plt.show()
# NOTE(review): the three lines below duplicate the import/plot just above.
from statsmodels.graphics.regressionplots import plot_leverage_resid2
fig = plot_leverage_resid2(results)
plt.show()
# Cooks distance - is also used estimate of the influence of a data point
influence = results.get_influence()
#c is the distance and p is p-value
(c, p) = influence.cooks_distance
# Annotate two high-influence points (row indices 368 and 372) on the stem plot.
plt.annotate("368",(368,c[368]))
plt.annotate("372",(372,c[372]))
plt.stem(np.arange(len(c)), c, markerfmt=",")
plt.show()
# --- Evaluate the linear model on an external sample set with measured PercentF ---
in_file = './sample-set.csv'
df_test = pd.read_csv(in_file, delimiter=',', usecols=[0, 1, 2, 3], names=['ID', 'Smiles', 'Name', 'PercentF'], header=None,encoding='latin-1') # Comma-separated
df_test['Name'] = df_test['Name'].str.upper()
df_test
in_file = './sample-smiles.csv'
df_perc = pd.read_csv(in_file, delimiter=',', usecols=[0, 1, 2, 3], names=['ID', 'Name_alt', 'Name', 'Smiles'], header=None,encoding='latin-1') # Comma-separated
df_perc
# Join the two tables on Name to pair measurements with canonical SMILES.
df_final = df_test.merge(df_perc, on='Name', how='inner', suffixes=('_1', '_2'))[['Name', 'PercentF', 'ID_2', 'Smiles_2']]
df_final = df_final.rename(columns={'ID_2': 'ID', 'Smiles_2': 'Smiles'})
df_final
model_path = './models/model_300dim.pkl'
out_file = 'BA-test-vectors.csv'
X_test, df_test = featurize(df_final, out_file, model_path, 2, uncommon='UNK')
cols_to_drop = ['ID', 'Name', 'Smiles', 'PercentF', 'ROMol', 'QED', 'mol-sentence']
Y_test = df_test['PercentF'].astype('float32')
# NOTE: the X_test returned by featurize is overwritten here with the
# feature dataframe (metadata columns dropped).
X_test = df_test.drop(cols_to_drop, axis=1)
residuals = Y_test - lm.predict(X_test)
print("RSS : ", np.sum(residuals**2))
print("MSE : ", np.mean(residuals**2))
# Residuals vs fitted values on the external set.
plt.plot(lm.predict(X_test), residuals,'o')
plt.hlines(xmin=np.min(lm.predict(X_test)),xmax=np.max(lm.predict(X_test)),y=0)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.show()
lm.predict(X_test)
preds = lm.predict(X_test)
index = range(len(preds))
# Overlay predictions (o) and true values (x) per example.
plt.plot(index, preds, 'o')
plt.plot(index, Y_test,'x')
plt.vlines(ymin=np.min(preds),ymax=np.max(preds),x=0)
plt.hlines(xmin=np.min(index),xmax=np.max(index),y=0)
plt.xlabel('Example')
plt.ylabel('True Values:x, Prediction:o ')
plt.show()